data_df=read.csv("data/cdi.csv") %>% 
  mutate(CRM_1000=crimes/pop*1000) %>%  ## add new variable CRM_1000(the crime rate per 1,000 population) 
  select(-crimes,-pop) %>% ## Since new variable CRM_1000=crimes/pop*1000, we do not consider these two variables(crimes and pop) any more
  select(-id) %>% 
  mutate(region=as.factor(region))

data_df %>% skimr::skim_without_charts()
Data summary
Name Piped data
Number of rows 440
Number of columns 15
_______________________
Column type frequency:
character 2
factor 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
cty 0 1 3 8 0 371 0
state 0 1 2 2 0 48 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
region 0 1 FALSE 4 3: 152, 2: 108, 1: 103, 4: 77

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
area 0 1 1041.41 1549.92 15.0 451.25 656.50 946.75 20062.00
pop18 0 1 28.57 4.19 16.4 26.20 28.10 30.02 49.70
pop65 0 1 12.17 3.99 3.0 9.88 11.75 13.62 33.80
docs 0 1 988.00 1789.75 39.0 182.75 401.00 1036.00 23677.00
beds 0 1 1458.63 2289.13 92.0 390.75 755.00 1575.75 27700.00
hsgrad 0 1 77.56 7.02 46.6 73.88 77.70 82.40 92.90
bagrad 0 1 21.08 7.65 8.1 15.28 19.70 25.33 52.30
poverty 0 1 8.72 4.66 1.4 5.30 7.90 10.90 36.30
unemp 0 1 6.60 2.34 2.2 5.10 6.20 7.50 21.30
pcincome 0 1 18561.48 4059.19 8899.0 16118.25 17759.00 20270.00 37541.00
totalinc 0 1 7869.27 12884.32 1141.0 2311.00 3857.00 8654.25 184230.00
CRM_1000 0 1 57.29 27.33 4.6 38.10 52.43 72.60 295.99

each continuous virable distribution

boxplot for each continuous variable

par(mfrow=c(3,4))
boxplot(data_df$area,main='Land area measured in square miles')


boxplot(data_df$pop18,main='Percent of population aged 18-34')

boxplot(data_df$pop65,main='Percent of populationß aged 65+')

boxplot(data_df$docs,main='Number of active physicians')

boxplot(data_df$beds,main='Number of hospital beds')


boxplot(data_df$hsgrad,main='Percent high school graduates')

boxplot(data_df$bagrad,main='Percent bachelor’s degrees')

boxplot(data_df$unemp,main='Percent below poverty level')

boxplot(data_df$pcincome,main='Per capita income')

boxplot(data_df$totalinc,main='Total personal income')

boxplot(data_df$CRM_1000,main='the crime rate per 1,000 population')

relationship of two variables(pairs)

pair_df=
  data_df %>% 
    select(-cty,-state) 
    
pairs(pair_df)

## Correlation plot

cor_df=
  data_df %>% 
    select(-cty,-state,-region) %>% 
    cor() ## Correlation coefficient


corrplot(cor(cor_df), type = "upper", diag = FALSE)

相关系数:|r|<0.4为低度线性相关;0.4≤|r|<0.7为显著性相关;0.7≤|r|<1为高度线性相关

显著相关: pop18 vs bagrad 0.45609703
pop18 vs pop65 -0.616309639
hsgrad vs poverty -0.691750483 hsgrad vs unemp -0.593595788 hsgrad vs pcincome 0.52299613 bagrad vs unemp -0.540906913
bagrad vs pcincome 0.69536186 poverty vs bagrad -0.40842385 unemp vs poverty 0.436947236 pcincome vs poverty -0.60172504 CRM_100 VS poverty 0.471844218

高度线性相关: bed vs docs 0.950464395 hsgrad vs bagrad 0.70778672 totalinc vs docs 0.948110571
totalinc vs beds 0.902061545

Since correlation coefficient of bed and docs is the highest, draw a scatterpoint plot of these two variables

data_df %>% 
  ggplot(aes(beds,docs)) + geom_point()

data_df %>% 
  ggplot(aes(totalinc,docs)) + geom_point()

check if some variables approximate to normal distribution

qqnorm(data_df$CRM_1000) #非常接近线性

qqnorm(data_df$area)

qqnorm(data_df$pop18) #接近线性

qqnorm(data_df$pop65)

qqnorm(data_df$docs)

qqnorm(data_df$beds)

qqnorm(data_df$hsgrad) #接近线性

qqnorm(data_df$bagrad) 

qqnorm(data_df$poverty) 

qqnorm(data_df$unemp) 

qqnorm(data_df$pcincome)  #接近线性

qqnorm(data_df$totalinc)
qqnorm(data_df$totalinc)

## establish a simple linear regression of CRM_1000 VS poverty

slr1 = lm(CRM_1000~poverty,data=data_df)
summary(slr1)
## 
## Call:
## lm(formula = CRM_1000 ~ poverty, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.008 -14.578  -2.561  13.605 208.853 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33.1390     2.4435   13.56   <2e-16 ***
## poverty       2.7690     0.2472   11.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.12 on 438 degrees of freedom
## Multiple R-squared:  0.2226, Adjusted R-squared:  0.2209 
## F-statistic: 125.4 on 1 and 438 DF,  p-value: < 2.2e-16
anova(slr1)
## Analysis of Variance Table
## 
## Response: CRM_1000
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## poverty     1  72991   72991  125.44 < 2.2e-16 ***
## Residuals 438 254856     582                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(slr1)

fit regression using all predictors(except ID,cty,state)

data_df=data_df %>% 
  select(-cty,-state)
mult.fit = lm(CRM_1000 ~ ., data = data_df)
summary(mult.fit)
## 
## Call:
## lm(formula = CRM_1000 ~ ., data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.880 -10.126  -1.627   9.724 193.039 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.043e+01  2.973e+01  -2.705 0.007103 ** 
## area        -5.519e-04  7.300e-04  -0.756 0.450046    
## pop18        1.368e+00  3.527e-01   3.879 0.000121 ***
## pop65        1.884e-01  3.171e-01   0.594 0.552716    
## docs        -7.066e-03  2.498e-03  -2.829 0.004896 ** 
## beds         1.174e-02  1.567e-03   7.490 4.03e-13 ***
## hsgrad       2.076e-01  2.930e-01   0.708 0.479090    
## bagrad      -3.362e-01  3.191e-01  -1.054 0.292702    
## poverty      2.372e+00  4.009e-01   5.915 6.83e-09 ***
## unemp        2.234e-01  5.650e-01   0.395 0.692742    
## pcincome     2.487e-03  5.021e-04   4.954 1.05e-06 ***
## totalinc    -6.745e-04  2.629e-04  -2.566 0.010632 *  
## region2      8.505e+00  2.957e+00   2.877 0.004221 ** 
## region3      2.493e+01  2.895e+00   8.610  < 2e-16 ***
## region4      2.305e+01  3.663e+00   6.294 7.72e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.43 on 425 degrees of freedom
## Multiple R-squared:  0.5107, Adjusted R-squared:  0.4946 
## F-statistic: 31.68 on 14 and 425 DF,  p-value: < 2.2e-16
broom::tidy(mult.fit)
## # A tibble: 15 × 5
##    term          estimate std.error statistic  p.value
##    <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept) -80.4      29.7         -2.71  7.10e- 3
##  2 area         -0.000552  0.000730    -0.756 4.50e- 1
##  3 pop18         1.37      0.353        3.88  1.21e- 4
##  4 pop65         0.188     0.317        0.594 5.53e- 1
##  5 docs         -0.00707   0.00250     -2.83  4.90e- 3
##  6 beds          0.0117    0.00157      7.49  4.03e-13
##  7 hsgrad        0.208     0.293        0.708 4.79e- 1
##  8 bagrad       -0.336     0.319       -1.05  2.93e- 1
##  9 poverty       2.37      0.401        5.92  6.83e- 9
## 10 unemp         0.223     0.565        0.395 6.93e- 1
## 11 pcincome      0.00249   0.000502     4.95  1.05e- 6
## 12 totalinc     -0.000675  0.000263    -2.57  1.06e- 2
## 13 region2       8.51      2.96         2.88  4.22e- 3
## 14 region3      24.9       2.89         8.61  1.44e-16
## 15 region4      23.1       3.66         6.29  7.72e-10

假设检验 正态性 qqplot

qqPlot(mult.fit,id.method='identify',simulate = TRUE,main='Q-Q plot')
<<<<<<< HEAD

=======

>>>>>>> 29c7401617081becc00fada919dfce6a190c81cc
## [1]   6 282

可以看到所有的点都在直线附近,并都落在置信区间内,这表明正态性假设符合得很完美

假设检验 独立性

进行D-W检验:

durbinWatsonTest(mult.fit)
##  lag Autocorrelation D-W Statistic p-value
<<<<<<< HEAD
##    1      0.06792391      1.857479    0.12
=======
##    1      0.06792391      1.857479   0.126
>>>>>>> 29c7401617081becc00fada919dfce6a190c81cc
##  Alternative hypothesis: rho != 0

P = 0.124 > 0.05不显著,说明因变量之间无自相关性,互相独立。

假设检验 线性关系 成分残差图

crPlots(mult.fit) 

成分残差图证实了线性假设,说明线性模型对该数据集是比较合适的。

同方差性

mult.fit = lm(CRM_1000 ~ ., data = data_df)

ncvTest(mult.fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 173.5754, Df = 1, p = < 2.22e-16

p = 2.037e-09 显著,说明误差方差不恒定。选用almost所有predictor的线性回归不满足同方差性 异方差性的出现意味着误差项的方差不恒定,这常常出现在有异常值(Outlier)的数据集上,如果使用标准的回归模型,这些异常值的重要性往往被高估。在这种情况下,标准差和置信区间不一定会变大还是变小。